A public data set of spatio-temporal match events in soccer competitions¶
Luca Pappalardo, Paolo Cintia, Alessio Rossi, Emanuele Massucco, Paolo Ferragina, Dino Pedreschi & Fosca Giannotti
Nature Scientific Data 6, Article number: 236 (2019)
If you use this code or the plots generated from it, please cite or mention the following papers:
Pappalardo, L., Cintia, P., Rossi, A. et al. A public data set of spatio-temporal match events in soccer competitions. Sci Data 6, 236 (2019) doi:10.1038/s41597-019-0247-7, https://www.nature.com/articles/s41597-019-0247-7
Pappalardo, L., Cintia, P., Ferragina, P., Massucco, E., Pedreschi, D., Giannotti, F. (2019) PlayeRank: Data-driven Performance Evaluation and Player Ranking in Soccer via a Machine Learning Approach. ACM Transactions on Intelligent Systems and Technology 10(5) Article 59, DOI: https://doi.org/10.1145/3343172, https://dl.acm.org/citation.cfm?id=3343172
and the data collection on figshare:
- Pappalardo, Luca; Massucco, Emanuele (2019): Soccer match event dataset. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4415000
Import library¶
Here we import all the libraries needed to create the plots.
import json
from collections import Counter
import numpy as np
import operator
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib.patches import Ellipse
import seaborn as sns
import pandas as pd
import networkx as nx
import base64
from collections import defaultdict
import sys,os
import math
import random
import operator
import csv
import matplotlib.pylab as pyl
import itertools
import scipy as sp
from scipy import stats
from scipy import optimize
from scipy.integrate import quad
import warnings
warnings.filterwarnings('ignore')
Import data sets¶
Here we import the events data sets, the match data set, the players data set and the competition data set downloaded from the figshare collection (the download link is available at: https://www.nature.com/articles/s41597-019-0247-7).
# Load the raw JSON files of the collection from the local ./data folder.
nations = ['Italy','England','Germany','France','Spain','European_Championship','World_Cup']

# events and matches come as one file per competition; both dicts are keyed
# by the competition name used in the file name
events = {}
matches = {}
per_competition_sources = (
    (events, './data/events/events_%s.json'),
    (matches, './data/matches/matches_%s.json'),
)
for target, path_template in per_competition_sources:
    for nation in nations:
        with open(path_template % nation) as json_data:
            target[nation] = json.load(json_data)

# players and competitions are single files
with open('./data/players.json') as json_data:
    players = json.load(json_data)
with open('./data/competitions.json') as json_data:
    competitions = json.load(json_data)
# Flatten the per-competition event lists into one table. Every event dict is
# tagged in place with the competition ("nation") it was loaded from.
converted_list = []
for nation, event in events.items():
    for e in event:
        e['nation'] = nation
    converted_list.extend(event)
df_events = pd.DataFrame(converted_list)

# The first entry of 'positions' gives the event's x/y; x is kept as is and
# y is mirrored (100 - y).
first_positions = [pos[0] for pos in df_events['positions']]
df_events['x'] = [first['x'] for first in first_positions]
df_events['y'] = [100 - first['y'] for first in first_positions]
df_events
| eventId | subEventName | tags | playerId | positions | matchId | eventName | teamId | matchPeriod | eventSec | subEventId | id | nation | x | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | Simple pass | [{'id': 1801}] | 8327 | [{'y': 52, 'x': 49}, {'y': 44, 'x': 43}] | 2575959 | Pass | 3158 | 1H | 2.530536 | 85 | 180423957 | Italy | 49 | 48 |
| 1 | 8 | Simple pass | [{'id': 1801}] | 20438 | [{'y': 44, 'x': 43}, {'y': 17, 'x': 36}] | 2575959 | Pass | 3158 | 1H | 3.768418 | 85 | 180423958 | Italy | 43 | 56 |
| 2 | 7 | Touch | [] | 8306 | [{'y': 17, 'x': 36}, {'y': 56, 'x': 78}] | 2575959 | Others on the ball | 3158 | 1H | 4.868265 | 72 | 180423959 | Italy | 36 | 83 |
| 3 | 1 | Ground attacking duel | [{'id': 504}, {'id': 703}, {'id': 1801}] | 8306 | [{'y': 56, 'x': 78}, {'y': 15, 'x': 64}] | 2575959 | Duel | 3158 | 1H | 8.114676 | 11 | 180423960 | Italy | 78 | 44 |
| 4 | 1 | Ground attacking duel | [{'id': 503}, {'id': 703}, {'id': 1801}] | 8306 | [{'y': 15, 'x': 64}, {'y': 15, 'x': 72}] | 2575959 | Duel | 3158 | 1H | 8.647892 | 11 | 180423961 | Italy | 64 | 85 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3251289 | 8 | Simple pass | [{'id': 1801}] | 3476 | [{'y': 20, 'x': 46}, {'y': 6, 'x': 64}] | 2058017 | Pass | 9598 | 2H | 2978.301867 | 85 | 263885652 | World_Cup | 46 | 80 |
| 3251290 | 7 | Touch | [] | 14812 | [{'y': 6, 'x': 64}, {'y': 2, 'x': 82}] | 2058017 | Others on the ball | 9598 | 2H | 2979.084611 | 72 | 263885653 | World_Cup | 64 | 94 |
| 3251291 | 8 | Cross | [{'id': 401}, {'id': 801}, {'id': 1802}] | 14812 | [{'y': 2, 'x': 82}, {'y': 100, 'x': 100}] | 2058017 | Pass | 9598 | 2H | 2983.448628 | 80 | 263885654 | World_Cup | 82 | 98 |
| 3251292 | 4 | Goalkeeper leaving line | [] | 25381 | [{'y': 0, 'x': 0}, {'y': 98, 'x': 18}] | 2058017 | Goalkeeper leaving line | 4418 | 2H | 2985.869275 | 40 | 263885613 | World_Cup | 0 | 100 |
| 3251293 | 8 | Launch | [{'id': 1802}] | 25381 | [{'y': 43, 'x': 14}, {'y': 0, 'x': 0}] | 2058017 | Pass | 4418 | 2H | 3002.148765 | 84 | 263885618 | World_Cup | 14 | 57 |
3251294 rows × 15 columns
# Replace each player's nested 'role' entry by its 'name' value so that the
# resulting column holds plain role labels.
for item in players:
    if 'role' not in item:
        continue
    if 'name' in item['role']:
        item['role'] = item['role']['name']

players_df = pd.DataFrame(players)
players_df
| passportArea | weight | firstName | middleName | lastName | currentTeamId | birthDate | height | role | birthArea | wyId | foot | shortName | currentNationalTeamId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | {'name': 'Turkey', 'id': '792', 'alpha3code': ... | 78 | Harun | Tekin | 4502 | 1989-06-17 | 187 | Goalkeeper | {'name': 'Turkey', 'id': '792', 'alpha3code': ... | 32777 | right | H. Tekin | 4687 | |
| 1 | {'name': 'Senegal', 'id': '686', 'alpha3code':... | 73 | Malang | Sarr | 3775 | 1999-01-23 | 182 | Defender | {'name': 'France', 'id': '250', 'alpha3code': ... | 393228 | left | M. Sarr | 4423 | |
| 2 | {'name': 'France', 'id': '250', 'alpha3code': ... | 72 | Over | Mandanda | 3772 | 1998-10-26 | 176 | Goalkeeper | {'name': 'France', 'id': '250', 'alpha3code': ... | 393230 | O. Mandanda | null | ||
| 3 | {'name': 'Senegal', 'id': '686', 'alpha3code':... | 82 | Alfred John Momar | N'Diaye | 683 | 1990-03-06 | 187 | Midfielder | {'name': 'France', 'id': '250', 'alpha3code': ... | 32793 | right | A. N'Diaye | 19314 | |
| 4 | {'name': 'France', 'id': '250', 'alpha3code': ... | 84 | Ibrahima | Konat\u00e9 | 2975 | 1999-05-25 | 192 | Defender | {'name': 'France', 'id': '250', 'alpha3code': ... | 393247 | right | I. Konat\u00e9 | null | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3598 | {'name': 'Tunisia', 'id': 788, 'alpha3code': '... | 72 | Ali | Ma\u00e2loul | 16041 | 1990-01-01 | 175 | Defender | {'name': 'Tunisia', 'id': 788, 'alpha3code': '... | 120839 | left | A. Ma\u00e2loul | null | |
| 3599 | {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... | 76 | Carlos Alberto | C\u00e1ceda Oyaguez | 15591 | 1991-09-27 | 183 | Goalkeeper | {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... | 114736 | right | C. C\u00e1ceda | null | |
| 3600 | {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... | 78 | Miguel Gianpierre | Araujo Blanco | 12072 | 1994-10-24 | 179 | Defender | {'name': 'Peru', 'id': 604, 'alpha3code': 'PER... | 114908 | right | M. Araujo | null | |
| 3601 | {'name': 'Morocco', 'id': 504, 'alpha3code': '... | 70 | Ahmed Reda | Tagnaouti | 16183 | 1996-04-05 | 182 | Goalkeeper | {'name': 'Morocco', 'id': 504, 'alpha3code': '... | 285583 | right | A. Tagnaouti | null | |
| 3602 | {'name': 'Panama', 'id': 591, 'alpha3code': 'P... | 0 | Ricardo | Guardia Avila | 62943 | 1997-02-04 | 0 | Midfielder | {'name': 'Panama', 'id': 591, 'alpha3code': 'P... | 361536 | left | R. Avila | null |
3603 rows × 14 columns
# Attach player attributes to every event. The inner join keeps only events
# whose playerId matches a player's wyId.
merged_df = pd.merge(df_events, players_df, left_on='playerId', right_on='wyId', how='inner')
merged_df = merged_df[['eventId', 'eventName', 'playerId', 'subEventName', 'tags', 'role', 'nation', 'x', 'y']]

# Keep only the event families that are categorised and plotted below.
events_to_keep = ['Pass', 'Duel', 'Shot', 'Others on the ball', 'Free Kick']

# .copy() makes the filtered frame independent of merged_df: new columns are
# assigned to it further down, and assigning to a bare slice is chained
# assignment (SettingWithCopyWarning, silenced by the global warnings filter).
merged_df_preprocessed1 = merged_df[merged_df['eventName'].isin(events_to_keep)].copy()
def determine_category1(row):
    """Return the first-level plotting category of an event row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'eventName' and 'subEventName'.

    Returns:
        - ``row['eventName']`` unchanged for every event that is not
          'Others on the ball';
        - 'Others on the ball - Acceleration' or
          'Others on the ball - Clearance' when the sub-event name matches
          (case-insensitively);
        - the sentinel 'Drop Row' for any other 'Others on the ball' event,
          including rows whose sub-event name is missing.
    """
    event_name = row['eventName']
    if event_name != 'Others on the ball':
        return event_name

    sub_event = row['subEventName']
    # A missing sub-event (None / NaN) has no .lower(); it cannot be one of
    # the two wanted sub-events, so it gets the sentinel instead of raising.
    if not isinstance(sub_event, str):
        return 'Drop Row'

    sub_event = sub_event.lower()
    if sub_event == 'acceleration':
        return 'Others on the ball - Acceleration'
    if sub_event == 'clearance':
        return 'Others on the ball - Clearance'
    return 'Drop Row'
# Label every row with its first-level category (row-wise apply).
merged_df_preprocessed1 = merged_df_preprocessed1.assign(
    category1=merged_df_preprocessed1.apply(determine_category1, axis=1)
)
merged_df_preprocessed1
| eventId | eventName | playerId | subEventName | tags | role | nation | x | y | category1 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | Pass | 8327 | Simple pass | [{'id': 1801}] | Forward | Italy | 49 | 48 | Pass |
| 1 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 602}, {'id': 703}, {'id': 1801}] | Forward | Italy | 72 | 75 | Duel |
| 2 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 701}, {'id': 1802}] | Forward | Italy | 82 | 36 | Duel |
| 3 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 702}, {'id': 1801}] | Forward | Italy | 71 | 71 | Duel |
| 4 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 602}, {'id': 701}, {'id': 1802}] | Forward | Italy | 72 | 26 | Duel |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3025251 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 71 | 4 | Pass |
| 3025252 | 1 | Duel | 70379 | Ground attacking duel | [{'id': 502}, {'id': 703}, {'id': 1801}] | Midfielder | World_Cup | 78 | 10 | Duel |
| 3025253 | 8 | Pass | 70379 | Cross | [{'id': 401}, {'id': 801}, {'id': 1802}] | Midfielder | World_Cup | 76 | 20 | Pass |
| 3025254 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 70 | 32 | Pass |
| 3025255 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 75 | 3 | Pass |
2942586 rows × 10 columns
def determine_category2(row):
    """Return the second-level category of an event row.

    Passes are split by the role of the player: 'Pass Forward' for role
    'Forward', 'Pass Midfielder' for role 'Midfielder', and 'Pass Defender'
    for every other role value. Any event that is not a pass keeps its
    'eventName'.

    Args:
        row: mapping-like object with the key 'eventName' and, for passes,
            the key 'role'.
    """
    # Non-pass events are returned untouched; 'role' is not read for them.
    if row['eventName'] != 'Pass':
        return row['eventName']

    player_role = row['role']
    if player_role == 'Forward':
        return 'Pass Forward'
    if player_role == 'Midfielder':
        return 'Pass Midfielder'
    # Fallback bucket for all remaining roles.
    return 'Pass Defender'
# Label every row with its second-level category (passes split by role).
merged_df_preprocessed1 = merged_df_preprocessed1.assign(
    category2=merged_df_preprocessed1.apply(determine_category2, axis=1)
)
merged_df_preprocessed1
| eventId | eventName | playerId | subEventName | tags | role | nation | x | y | category1 | category2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8 | Pass | 8327 | Simple pass | [{'id': 1801}] | Forward | Italy | 49 | 48 | Pass | Pass Forward |
| 1 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 602}, {'id': 703}, {'id': 1801}] | Forward | Italy | 72 | 75 | Duel | Duel |
| 2 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 701}, {'id': 1802}] | Forward | Italy | 82 | 36 | Duel | Duel |
| 3 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 702}, {'id': 1801}] | Forward | Italy | 71 | 71 | Duel | Duel |
| 4 | 1 | Duel | 8327 | Ground attacking duel | [{'id': 602}, {'id': 701}, {'id': 1802}] | Forward | Italy | 72 | 26 | Duel | Duel |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3025251 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 71 | 4 | Pass | Pass Midfielder |
| 3025252 | 1 | Duel | 70379 | Ground attacking duel | [{'id': 502}, {'id': 703}, {'id': 1801}] | Midfielder | World_Cup | 78 | 10 | Duel | Duel |
| 3025253 | 8 | Pass | 70379 | Cross | [{'id': 401}, {'id': 801}, {'id': 1802}] | Midfielder | World_Cup | 76 | 20 | Pass | Pass Midfielder |
| 3025254 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 70 | 32 | Pass | Pass Midfielder |
| 3025255 | 8 | Pass | 70379 | Simple pass | [{'id': 1801}] | Midfielder | World_Cup | 75 | 3 | Pass | Pass Midfielder |
2942586 rows × 11 columns
Functions¶
Here we create all the functions needed to create the plots.
# Work on a random sample of at most 300,000 events (presumably to keep the
# interactive charts responsive - confirm the intended size).
# - n is capped at the frame length: DataFrame.sample raises ValueError when
#   asked for more rows than exist without replacement.
# - random_state is fixed so that re-running the notebook draws the same rows.
merged_df_preprocessed1 = merged_df_preprocessed1.sample(
    n=min(300000, len(merged_df_preprocessed1)),
    random_state=42,
)
merged_df_preprocessed1.category1.unique()
array(['Duel', 'Pass', 'Drop Row', 'Free Kick',
'Others on the ball - Clearance', 'Shot',
'Others on the ball - Acceleration'], dtype=object)
# Categories plotted further down: Pass, Shot, Duel, Free Kick,
# 'Others on the ball - Acceleration' and 'Others on the ball - Clearance'.
# Rows labelled 'Drop Row' are still in the frame but are not in that list.
merged_df_preprocessed1.columns
Index(['eventId', 'eventName', 'playerId', 'subEventName', 'tags', 'role',
'nation', 'x', 'y', 'category1', 'category2'],
dtype='object')
import altair as alt
import pandas as pd
# Lift Altair's row limit on chart data: the frame plotted in this notebook
# is a 300,000-row sample.
alt.data_transformers.disable_max_rows()
def draw_pitch_altair():
    """Build the pitch markings as a layered Altair chart.

    The markings live on a 0-100 by 0-100 coordinate square: the four
    boundary lines, a vertical line at x=50, a three-sided box at each end
    (17 units deep, spanning y=25..75) and a circle mark at (50, 50).

    Returns:
        An ``alt.layer`` of the rule marks and the circle mark, sized
        700x500.
    """
    # (start, end) points of every straight marking
    segments = [
        # boundary
        ([0, 0], [100, 0]),
        ([100, 0], [100, 100]),
        ([100, 100], [0, 100]),
        ([0, 100], [0, 0]),
        # line at x=50
        ([50, 0], [50, 100]),
        # box on the x=0 side
        ([17, 25], [17, 75]),
        ([17, 25], [0, 25]),
        ([17, 75], [0, 75]),
        # box on the x=100 side
        ([100, 25], [83, 25]),
        ([83, 25], [83, 75]),
        ([100, 75], [83, 75]),
    ]
    pitch_df = pd.DataFrame(
        [{'start': start, 'end': end} for start, end in segments]
    )

    # The encodings index into the list-valued 'start' / 'end' columns.
    rule_encoding = dict(
        x='start[0]:Q',
        y='start[1]:Q',
        x2='end[0]:Q',
        y2='end[1]:Q',
        color=alt.value('black'),
    )
    lines = alt.Chart(pitch_df).mark_rule(strokeWidth=2).encode(**rule_encoding)

    # Unfilled black circle mark at the centre point
    centre_point = pd.DataFrame({'x': [50], 'y': [50]})
    center_circle = alt.Chart(centre_point).mark_circle(
        size=3000, stroke='black', strokeWidth=2, fill=None
    )
    center_circle = center_circle.encode(x='x:Q', y='y:Q')
    center_circle = center_circle.properties(width=600, height=400)

    return alt.layer(lines, center_circle).properties(width=700, height=500)
def create_heatmap(df, category, colors):
    """Build a binned 2-D count heatmap for one event category.

    Args:
        df: DataFrame with the columns 'category1', 'x' and 'y'.
        category: value of 'category1' whose rows are plotted.
        colors: name of the colour scheme passed to ``alt.Scale(scheme=...)``.

    Returns:
        A 700x500 Altair rect chart with x and y binned (maxbins=100) and
        the colour encoding the number of rows per bin.
    """
    # Select rows with a boolean mask rather than interpolating `category`
    # into a df.query() string: a quote character in the category would
    # break (or alter) the query expression.
    selected = df[df['category1'] == category]

    return (
        alt.Chart(selected)
        .mark_rect()
        .encode(
            alt.X('x:Q', bin=alt.Bin(maxbins=100)),
            alt.Y('y:Q', bin=alt.Bin(maxbins=100)),
            color=alt.Color('count()', scale=alt.Scale(scheme=colors)),
        )
        .properties(width=700, height=500)
    )
# Dropdown-bound single selections used to filter the heatmaps. Each option
# list starts with None; together with empty='all' this presumably acts as
# the "no filter" choice (confirm against the Altair selection docs).

# One option per competition present in the (sampled) events frame
nation_options = [None] + sorted(merged_df_preprocessed1['nation'].unique())
nation_selector = alt.selection_single(
    name="Select Nation",
    fields=['nation'],
    bind=alt.binding_select(options=nation_options),
    empty='all',
)

# Per-role breakdown of passes (values of the 'category2' column)
detail_options = [None, 'Pass Defender', 'Pass Midfielder', 'Pass Forward']
more_detail_selector = alt.selection_single(
    name="See More Detail",
    fields=['category2'],
    bind=alt.binding_select(options=detail_options),
    empty='all',
)
def create_interactive_heatmap(df, category, colors, detail_selector=None):
    """Build a category heatmap over the pitch with dropdown filters.

    The heatmap from ``create_heatmap`` is filtered by the module-level
    ``nation_selector`` and, when given, by ``detail_selector``. It is then
    layered with the pitch from ``draw_pitch_altair``.

    Args:
        df: DataFrame passed through to ``create_heatmap``.
        category: value of 'category1' to plot; also used in the title.
        colors: colour scheme name passed through to ``create_heatmap``.
        detail_selector: optional additional Altair selection. When given,
            the heatmap is filtered by it and it is registered on the chart.

    Returns:
        The layered 700x500 Altair chart titled "<category> Events by Nation".
    """
    heatmap = create_heatmap(df, category, colors).transform_filter(nation_selector)

    # Explicit None test: the decision should not depend on how a selection
    # object happens to evaluate in a boolean context.
    use_detail = detail_selector is not None
    if use_detail:
        heatmap = heatmap.transform_filter(detail_selector)

    final_chart = alt.layer(
        heatmap,
        draw_pitch_altair()
    ).add_selection(
        nation_selector
    ).properties(
        width=700,
        height=500,
        title=f"{category} Events by Nation"
    )

    # Register exactly the selection that the filter above refers to.
    # Previously the global more_detail_selector was added whenever
    # category == 'Pass', so a selector passed for another category was
    # filtered on without being registered, and 'Pass' without a selector
    # got a dropdown that filtered nothing.
    if use_detail:
        final_chart = final_chart.add_selection(detail_selector)

    return final_chart
# Build one interactive heatmap per activity type, then render them in order.
final_charts = []
activity_types = ['Shot', 'Pass', 'Duel', 'Free Kick', 'Others on the ball - Acceleration', 'Others on the ball - Clearance']
for activity in activity_types:
    # Only the Pass chart receives the extra per-role selector.
    extra_args = (more_detail_selector,) if activity == 'Pass' else ()
    heatmap = create_interactive_heatmap(merged_df_preprocessed1, activity, 'greens', *extra_args)
    # The title set here replaces the one set inside create_interactive_heatmap.
    final_chart = heatmap.properties(width=700, height=500, title=activity)
    final_charts.append(final_chart)

for chart in final_charts:
    chart.display()